Andrew Stewart, Division of Neuroscience and Experimental Psychology.

email: andrew.stewart@manchester.ac.uk

twitter: @ajstewart_lang

First load the tidyverse and gganimate packages.

library(tidyverse)
library(gganimate)

Load the packages we need for our linear mixed models.

library(lme4)
library(lmerTest)
library(emmeans)

Load the data.

# Load the first-pass reading time data; read_csv() implies a comma-separated
# file despite the .ixs extension — see the tibble printed further down.
data <- read_csv("fpfp0.ixs")

Labelling for each condition is as follows: C1 = Neutral condition C2 = Negative condition C3 = Positive condition

Relabel the Condition variable (“cond”) and set as.factor()

# Give the condition codes meaningful labels and convert to a factor in one step
data$cond <- as.factor(recode(data$cond,
                              "C1" = "Neutral",
                              "C2" = "Negative",
                              "C3" = "Positive"))

Filter so we only have the Region 3 data - this is our critical analysis region. Ignore any zero reading times - these will be tracker loss/missing data.

# Keep only the critical region (R3) and drop zero reading times (tracker loss)
data_R3 <- data %>%
  filter(reg == "R3", DV > 0)

# Print the filtered tibble to sanity-check the filtering
data_R3
## # A tibble: 572 x 7
##    subj  item  cond       seq meas  reg      DV
##    <chr> <chr> <fct>    <int> <chr> <chr> <int>
##  1 S1    I1    Neutral     60 FP    R3      867
##  2 S1    I2    Positive    44 FP    R3     1061
##  3 S1    I3    Negative    30 FP    R3      771
##  4 S1    I4    Neutral     15 FP    R3      626
##  5 S1    I5    Positive    55 FP    R3     1283
##  6 S1    I6    Negative    18 FP    R3      846
##  7 S1    I7    Neutral     32 FP    R3      547
##  8 S1    I8    Positive    70 FP    R3     1135
##  9 S1    I9    Negative    37 FP    R3     1254
## 10 S1    I10   Neutral      1 FP    R3      926
## # ... with 562 more rows

Let’s visualise the data for each of the 3 conditions.

# Histogram of the reading-time distribution, one panel per condition
ggplot(data_R3, aes(x = DV)) +
  geom_histogram() +
  facet_wrap(~cond)

Let’s look at the data on a Participant by Participant basis.

# Jittered raw reading times per condition, faceted by participant, with the
# condition mean overlaid in black.
# Fixes: stat_summary(fun.y = ...) was deprecated in ggplot2 3.3.0 (use fun = ...);
# guide = FALSE in discrete scales is deprecated (use guide = "none");
# "Facted" typo in the title corrected.
ggplot(data_R3, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .1, alpha = .5) +
  stat_summary(fun = mean, geom = "point", colour = "black") +
  facet_wrap(~subj) +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "Critical Region, First Pass Reading Times Facet Wrapped by Participant")

And via gganimate.

# Animated version: jittered points with condition means, stepping through
# participants via gganimate's transition_states().
# Fixes: fun.y -> fun (deprecated in ggplot2 3.3.0); guide = FALSE -> guide = "none".
ggplot(data_R3, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .2, alpha = .8, size = 3) +
  stat_summary(fun = mean, geom = "point", colour = "black", size = 5) +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "Critical Region, First Pass Reading Times \nAnimated by Participant {(closest_state)}") +
  theme(text = element_text(size = 15)) +
  transition_states(subj, transition_length = 1, state_length = 1) +
  enter_fade() +
  exit_fade()

Let’s work out for how many participants the effect went in the predicted direction. First we’re aggregating by participants.

# One mean/SD per participant per condition.
# ungroup() added: summarise() drops only the last grouping variable, so the
# original left data_agg grouped by subj — a common source of downstream
# surprises with dplyr verbs. Values are unchanged.
data_agg <- data_R3 %>%
  group_by(subj, cond) %>%
  summarise(mean = mean(DV), sd = sd(DV)) %>%
  ungroup()

For how many people was the Negative condition faster than the Positive?

# Count participants whose Negative mean is below their Positive mean.
# NOTE(review): this compares the two condition subsets element-wise, assuming
# both are ordered identically by subj — holds here because both come from the
# same grouped summary; verify if the aggregation ever changes.
sum (data_agg[data_agg$cond=="Negative",]$mean < data_agg[data_agg$cond=="Positive",]$mean)
## [1] 18

For how many people was the Negative condition faster than the Neutral?

# Count participants whose Negative mean is below their Neutral mean
# (element-wise comparison — relies on identical subj ordering in both subsets)
sum (data_agg[data_agg$cond=="Negative",]$mean < data_agg[data_agg$cond=="Neutral",]$mean)
## [1] 19

For how many people were the above both TRUE?

# Count participants for whom Negative is the fastest of all three conditions
# (both previous comparisons TRUE simultaneously)
sum (data_agg[data_agg$cond=="Negative",]$mean < data_agg[data_agg$cond=="Positive",]$mean & 
       data_agg[data_agg$cond=="Negative",]$mean < data_agg[data_agg$cond=="Neutral",]$mean)
## [1] 16

Let’s do the same but this time by Items.

# Same jitter + condition-mean plot as before, but faceted by item.
# Fixes: fun.y -> fun (deprecated in ggplot2 3.3.0); guide = FALSE -> "none";
# "Facted" typo in the title corrected.
ggplot(data_R3, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .1, alpha = .5) +
  stat_summary(fun = mean, geom = "point", colour = "black") +
  facet_wrap(~item) +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "Critical Region, First Pass Reading Times Facet Wrapped by Item")

# One mean/SD per item per condition; ungroup() so the result is a plain
# tibble rather than one still grouped by item (values unchanged).
data_agg <- data_R3 %>%
  group_by(item, cond) %>%
  summarise(mean = mean(DV), sd = sd(DV)) %>%
  ungroup()

# Items where the Negative mean is below the Positive mean (element-wise by item)
sum (data_agg[data_agg$cond=="Negative",]$mean < data_agg[data_agg$cond=="Positive",]$mean)
## [1] 17
# Items where the Negative mean is below the Neutral mean (element-wise by item)
sum (data_agg[data_agg$cond=="Negative",]$mean < data_agg[data_agg$cond=="Neutral",]$mean)
## [1] 18

Let’s build a raincloud plot.

library(RColorBrewer)
library(plyr) #note, need to detach this after this plot as clashes with aspects of dplyr
# NOTE(review): the gist URL is pinned to a specific commit hash, but this still
# executes remote code at run time — consider vendoring geom_flat_violin.R locally.
source("https://gist.githubusercontent.com/ajstewartlang/6c4cd8ab9e0c27747424acdfb3b4cff6/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
# Shared theme for the raincloud plots: plain panels, visible axis lines,
# angled x-axis labels. Fixes: assignment with <- instead of = (tidyverse
# style), and consistent spacing around = in the arguments.
raincloud_theme <- theme(
  text = element_text(size = 12),
  axis.title.x = element_text(size = 12),
  axis.title.y = element_text(size = 12),
  axis.text = element_text(size = 12),
  axis.text.x = element_text(angle = 45, vjust = 0.5),
  legend.title = element_text(size = 12),
  legend.text = element_text(size = 12),
  legend.position = "right",
  #plot.title = element_text(lineheight=.8, size = 12),
  panel.border = element_blank(),
  panel.grid.minor = element_blank(),
  panel.grid.major = element_blank(),
  axis.line.x = element_line(colour = 'black', size = 0.5, linetype = 'solid'),
  axis.line.y = element_line(colour = 'black', size = 0.5, linetype = 'solid'))

# Lower/upper bounds for the summary data: one SD either side of the mean
lb <- function(x) {
  mean(x) - sd(x)
}
ub <- function(x) {
  mean(x) + sd(x)
}

# Per-condition summary (mean, median, mean +/- 1 SD) via plyr::ddply.
# NOTE(review): sumld is computed but not used by the plot below, which works
# from data_R3 directly.
sumld <- ddply(data_R3, ~cond, summarise, mean = mean(DV), median = median(DV), lower = lb(DV), upper = ub(DV))

# Raincloud plot for the critical region: half violin + raw jittered points +
# narrow boxplot, flipped horizontally.
# Fixes: guides(fill = FALSE) / guides(color = FALSE) are deprecated in ggplot2 —
# use "none"; the two calls are merged into one.
ggplot(data = data_R3, aes(y = DV, x = cond, fill = cond)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8, trim = FALSE) +
  geom_point(aes(y = DV, color = cond), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
  #expand_limits(x = 3) +
  guides(fill = "none", color = "none") +
  scale_color_brewer(palette = "Accent") +
  scale_fill_brewer(palette = "Accent") +
  coord_flip() +
  theme_bw() +
  raincloud_theme +
  labs(y = "Reading Time (msec.)", x = "Condition", title = "Critical Region, First Pass Reading Times")

# plyr masks several dplyr verbs (e.g. summarise), so detach it once done
detach("package:plyr", unload=TRUE) #need to detach as some clashes with dplyr

Build our first linear mixed model with maximal random effects structure.

# Null vs experimental LMMs with maximal random-effects structure:
# by-subject and by-item random intercepts AND random slopes for condition.
# REML = TRUE here; anova() refits both with ML automatically for the comparison.
model.null <- lmer(DV ~ (1+cond|subj) + (1+cond|item), data_R3, REML=TRUE)
model <- lmer(DV ~ cond + (1+cond|subj) + (1+cond|item), data_R3, REML=TRUE)

Check the experimental model differs from the Null model.

# Likelihood-ratio test: does adding condition improve fit over the null model?
# (lme4 refits both models with ML before comparing, as the output notes.)
anova(model.null, model)
## refitting model(s) with ML (instead of REML)
## Data: data_R3
## Models:
## model.null: DV ~ (1 + cond | subj) + (1 + cond | item)
## model: DV ~ cond + (1 + cond | subj) + (1 + cond | item)
##            Df    AIC    BIC  logLik deviance Chisq Chi Df Pr(>Chisq)   
## model.null 14 8648.1 8709.0 -4310.0   8620.1                           
## model      16 8642.3 8711.9 -4305.1   8610.3 9.817      2   0.007384 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Generate a summary of our model parameters plus run some pairwise comparisons.

# Parameter estimates; lmerTest supplies Satterthwaite df and p-values
summary(model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DV ~ cond + (1 + cond | subj) + (1 + cond | item)
##    Data: data_R3
## 
## REML criterion at convergence: 8581.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0879 -0.5173 -0.0208  0.4988  4.2992 
## 
## Random effects:
##  Groups   Name         Variance Std.Dev. Corr       
##  subj     (Intercept)   66595.9 258.06              
##           condNeutral    2053.1  45.31    1.00      
##           condPositive    147.5  12.14   -0.32 -0.29
##  item     (Intercept)   34565.7 185.92              
##           condNeutral     383.2  19.57    1.00      
##           condPositive   4833.4  69.52   -1.00 -1.00
##  Residual              171365.6 413.96              
## Number of obs: 572, groups:  subj, 24; item, 24
## 
## Fixed effects:
##              Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)    815.42      71.57  32.26  11.393 7.61e-13 ***
## condNeutral    142.03      43.66 143.18   3.253  0.00142 ** 
## condPositive   104.46      44.81  29.43   2.331  0.02679 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndNtr
## condNeutral -0.086       
## condPositiv -0.464  0.432
# Pairwise comparisons between the three conditions, unadjusted p-values
emmeans(model, pairwise~cond, adjust="none")
## $emmeans
##  cond       emmean       SE    df lower.CL  upper.CL
##  Negative 815.4171 71.57948 31.36 669.4985  961.3357
##  Neutral  957.4509 80.57012 32.56 793.4451 1121.4566
##  Positive 919.8750 64.45063 26.39 787.4909 1052.2591
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast              estimate       SE    df t.ratio p.value
##  Negative - Neutral  -142.03374 43.67792 11.27  -3.252  0.0075
##  Negative - Positive -104.45787 44.81766 12.18  -2.331  0.0377
##  Neutral - Positive    37.57587 47.18139 13.35   0.796  0.4397

Check we’re happy with the model residuals.

# Q-Q plot of the model residuals to eyeball the normality assumption
qqnorm(residuals(model))

I can live with these…

Now let’s look at the subsequent region of text (Region 4). Exclude missing data/tracker loss points where the DV==0.

# Post-critical region (R4) only, excluding zero (missing/tracker-loss) times
data_R4 <- data %>%
  filter(reg == "R4", DV > 0)

Need to load the plyr package again.

# Re-attach plyr for ddply (detached earlier because it masks dplyr verbs)
library(plyr)

Another Raincloud plot.

# Per-condition summary for Region 4 (mean, median, mean +/- 1 SD).
# NOTE(review): as with Region 3, sumld is not used by the plot below.
sumld <- ddply(data_R4, ~cond, summarise, mean = mean(DV), median = median(DV), lower = lb(DV), upper = ub(DV))

# Raincloud plot for the post-critical region (same construction as Region 3).
# Fixes: guides(... = FALSE) is deprecated in ggplot2 — use "none"; calls merged.
ggplot(data = data_R4, aes(y = DV, x = cond, fill = cond)) +
  geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8, trim = FALSE) +
  geom_point(aes(y = DV, color = cond), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
  geom_boxplot(width = .1, outlier.shape = NA, alpha = 0.5) +
  #expand_limits(x = 3) +
  guides(fill = "none", color = "none") +
  scale_color_brewer(palette = "Accent") +
  scale_fill_brewer(palette = "Accent") +
  coord_flip() +
  theme_bw() +
  raincloud_theme +
  labs(y = "Reading Time (msec.)", x = "Condition", title = "Post-Critical Region, First Pass Reading Times")

# Detach plyr again so later dplyr calls are not masked
detach("package:plyr", unload=TRUE) #need to detach as some clashes with dplyr

Build the linear mixed model for Region 4 as we did for Region 3.

# Region 4 models use random intercepts only (no condition slopes), unlike the
# maximal Region 3 models above — presumably for convergence; TODO confirm.
model.null <- lmer(DV ~ (1|subj) + (1|item), data_R4, REML=TRUE)
model <- lmer(DV ~ cond + (1|subj) + (1|item), data_R4, REML=TRUE)

# Likelihood-ratio test of the condition effect (anova refits with ML)
anova (model.null, model)
## refitting model(s) with ML (instead of REML)
## Data: data_R4
## Models:
## model.null: DV ~ (1 | subj) + (1 | item)
## model: DV ~ cond + (1 | subj) + (1 | item)
##            Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## model.null  4 8924.1 8941.5 -4458.1   8916.1                         
## model       6 8923.8 8949.9 -4455.9   8911.8 4.3149      2     0.1156
# Parameter estimates for the Region 4 model (Satterthwaite df via lmerTest)
summary (model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DV ~ cond + (1 | subj) + (1 | item)
##    Data: data_R4
## 
## REML criterion at convergence: 8881.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5734 -0.5398 -0.1635  0.4043  5.8942 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  subj     (Intercept) 122383   349.8   
##  item     (Intercept)  45746   213.9   
##  Residual             300455   548.1   
## Number of obs: 571, groups:  subj, 24; item, 24
## 
## Fixed effects:
##              Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)   1395.09      92.77   45.32  15.039   <2e-16 ***
## condNeutral     66.92      56.33  522.13   1.188    0.235    
## condPositive   -49.05      56.26  522.14  -0.872    0.384    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndNtr
## condNeutral -0.306       
## condPositiv -0.307  0.505
# Pairwise comparisons for Region 4, unadjusted p-values
emmeans (model, pairwise~cond, adjust="none")
## $emmeans
##  cond       emmean       SE    df lower.CL upper.CL
##  Negative 1395.088 92.76705 45.32 1208.282 1581.894
##  Neutral  1462.005 92.62314 45.05 1275.457 1648.552
##  Positive 1346.042 92.57551 44.95 1159.580 1532.504
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast             estimate       SE     df t.ratio p.value
##  Negative - Neutral  -66.91684 56.33498 522.13  -1.188  0.2354
##  Negative - Positive  49.04624 56.26042 522.14   0.872  0.3837
##  Neutral - Positive  115.96308 56.02282 522.04   2.070  0.0390

Looking at the first pass data animated by Region.

# All regions except R0, excluding zero (missing/tracker-loss) reading times
data1 <- data %>%
  filter(reg != "R0", DV > 0)

# Boxplots over jittered raw points, animated across text regions (R3 = critical).
# Fix: guide = FALSE in scale_color_discrete() is deprecated — use guide = "none".
ggplot(data1, aes(x = cond, y = DV, colour = cond)) +
  geom_jitter(width = .2, alpha = .8, size = 3) +
  geom_boxplot(width = .5, outlier.shape = NA, alpha = 0.3, colour = "black") +
  scale_color_discrete(guide = "none") +
  labs(y = "Reading Time (msec.)", x = "Condition",
       title = "First Pass Reading Times Animated by Region. \nThe Critical Region is R3. \nCurrent Region is {(closest_state)}") +
  theme(text = element_text(size = 15)) +
  coord_flip() +
  transition_states(reg, transition_length = 1, state_length = 1) +
  enter_fade() +
  exit_fade()

Read in the Regressions Out data.

# Read the regressions-out data; DV is parsed as integer (0/100 percentages,
# per the recode to 1/0 below)
FPRO <- read_csv("fproro0.ixs")
## Parsed with column specification:
## cols(
##   subj = col_character(),
##   item = col_character(),
##   cond = col_character(),
##   seq = col_integer(),
##   meas = col_character(),
##   reg = col_character(),
##   DV = col_integer()
## )

Recode as before.

# Critical region only, dropping condition C0; relabel and factorise as before
data_R3 <- FPRO %>%
  filter(reg == "R3", cond != "C0")
data_R3$cond <- as.factor(recode(data_R3$cond,
                                 "C1" = "Neutral",
                                 "C2" = "Negative",
                                 "C3" = "Positive"))

Let’s generate some summary data.

# Mean and SD of the regression percentage for each condition
data_summ <- data_R3 %>%
  group_by(cond) %>%
  summarise(mean = mean(DV), sd = sd(DV))

Build a bar chart.

# Bar chart of mean regression percentages with rounded labels above the bars.
# Fix: guide = FALSE in scale_fill_discrete() is deprecated — use guide = "none".
ggplot(data_summ, aes(x = cond, y = mean, fill = cond)) +
  geom_col() +
  geom_text(aes(label = format(round(mean, 0))), nudge_y = 1) +
  scale_fill_discrete(guide = "none") +
  labs(y = "Regressions (%)", x = "Condition",
       title = "Regressions Out of Critical Region")

Recode the DV as binomial.

# Recode the percentage DV (100/0) to binomial 1/0 for logistic modelling;
# with a numeric vector, recode() matches the quoted names "100"/"0" by value
data_R3$DV <- recode (data_R3$DV, "100"=1 ,"0"=0)

Build the null and experimental glmm given sampling from the binomial distribution.

# Null and experimental binomial (logistic) GLMMs with random intercepts for
# subjects and items, compared by likelihood-ratio test
model.null <- glmer (DV ~ (1|subj) + (1|item), data_R3, family=binomial)
model <- glmer (DV ~ cond + (1|subj) + (1|item), data_R3, family=binomial)
anova (model.null, model)
## Data: data_R3
## Models:
## model.null: DV ~ (1 | subj) + (1 | item)
## model: DV ~ cond + (1 | subj) + (1 | item)
##            Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## model.null  3 601.97 614.91 -297.98   595.97                         
## model       5 605.70 627.28 -297.85   595.70 0.2617      2     0.8773
# Fixed effects on the log-odds (logit) scale
summary (model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: DV ~ cond + (1 | subj) + (1 | item)
##    Data: data_R3
## 
##      AIC      BIC   logLik deviance df.resid 
##    605.7    627.3   -297.9    595.7      548 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3096 -0.6118 -0.4044  0.7636  3.1871 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  subj   (Intercept) 0.8362   0.9144  
##  item   (Intercept) 0.0000   0.0000  
## Number of obs: 553, groups:  subj, 24; item, 24
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.3095     0.2647  -4.947 7.55e-07 ***
## condNeutral    0.1093     0.2539   0.431    0.667    
## condPositive   0.1162     0.2515   0.462    0.644    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndNtr
## condNeutral -0.485       
## condPositiv -0.488  0.507

Now look at Regressions Out of the post-critical region (Region 4).

# Post-critical region, dropping condition C0; relabel and factorise as before
data_R4 <- FPRO %>%
  filter(reg == "R4", cond != "C0")
data_R4$cond <- as.factor(recode(data_R4$cond,
                                 "C1" = "Neutral",
                                 "C2" = "Negative",
                                 "C3" = "Positive"))

# Mean and SD of the regression percentage for each condition (Region 4)
data_summ <- data_R4 %>%
  group_by(cond) %>%
  summarise(mean = mean(DV), sd = sd(DV))

Build a bar chart.

# Bar chart of mean regression percentages for the post-critical region.
# Fix: guide = FALSE in scale_fill_discrete() is deprecated — use guide = "none".
ggplot(data_summ, aes(x = cond, y = mean, fill = cond)) +
  geom_col() +
  geom_text(aes(label = format(round(mean, 0))), nudge_y = 1) +
  scale_fill_discrete(guide = "none") +
  labs(y = "Regressions (%)", x = "Condition",
       title = "Regressions Out of Post-Critical Region")

Recode DV as binomial.

# Recode the percentage DV (100/0) to binomial 1/0, as for Region 3
data_R4$DV <- recode (data_R4$DV, "100"=1 ,"0"=0)

Build the glmms.

# Null and experimental binomial GLMMs for Region 4 (random intercepts only),
# compared by likelihood-ratio test
model.null <- glmer (DV ~ (1|subj) + (1|item), data_R4, family=binomial)
model <- glmer (DV ~ cond + (1|subj) + (1|item), data_R4, family=binomial)

anova (model.null, model)
## Data: data_R4
## Models:
## model.null: DV ~ (1 | subj) + (1 | item)
## model: DV ~ cond + (1 | subj) + (1 | item)
##            Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
## model.null  3 261.67 274.28 -127.83   255.67                           
## model       5 259.10 280.12 -124.55   249.10 6.5681      2    0.03748 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fixed effects on the log-odds (logit) scale
summary (model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: DV ~ cond + (1 | subj) + (1 | item)
##    Data: data_R4
## 
##      AIC      BIC   logLik deviance df.resid 
##    259.1    280.1   -124.6    249.1      490 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2733 -0.2756 -0.2033 -0.1468  5.2443 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  subj   (Intercept) 1.2797   1.1313  
##  item   (Intercept) 0.1814   0.4259  
## Number of obs: 495, groups:  subj, 24; item, 24
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.5034     0.5094  -6.877 6.12e-12 ***
## condNeutral    0.2484     0.5076   0.489   0.6246    
## condPositive   1.0495     0.4607   2.278   0.0227 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndNtr
## condNeutral -0.564       
## condPositiv -0.660  0.605

As the model comparison is significant, run some pairwise comparisons to determine where the effect is. Ask for the output on the response scale.

# Pairwise comparisons, unadjusted, back-transformed to the probability scale
# (type = "response"); contrasts are reported as odds ratios
emmeans (model, pairwise~cond, adjust="none", type="response")
## $emmeans
##  cond           prob         SE  df  asymp.LCL  asymp.UCL
##  Negative 0.02921439 0.01444844 Inf 0.01096581 0.07551245
##  Neutral  0.03714549 0.01697488 Inf 0.01498975 0.08908695
##  Positive 0.07914715 0.02932761 Inf 0.03759123 0.15904989
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale 
## 
## $contrasts
##  contrast            odds.ratio        SE  df z.ratio p.value
##  Negative / Neutral   0.7800601 0.3959661 Inf  -0.489  0.6246
##  Negative / Positive  0.3501293 0.1612980 Inf  -2.278  0.0227
##  Neutral / Positive   0.4488491 0.1940912 Inf  -1.853  0.0640
## 
## Tests are performed on the log odds ratio scale